import pandas as pd
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
#from google.colab import files #upload one file at a time (hours and days)
#uploaded = files.upload()
#ls
# Load the Bike Sharing dataset: hourly records and daily aggregates.
hour=pd.read_csv('hour.csv')
day=pd.read_csv('day.csv')
# Quick sanity checks (notebook-style bare expressions; they only display
# output in an interactive session — in a plain script they are no-ops).
hour.head()
day.head()
hour.info()
hour.shape
day.info()
day.shape
hour.describe()
day.describe()
# Percentage of missing values per column (expected to be 0 for this dataset).
print(hour.isna().sum()/len(hour)*100)
print(day.isna().sum()/len(day)*100)
# --- Exploratory analysis: ridership aggregates by hour, month, weekday, season ---
fig, ax = plt.subplots()
# Pass a list (not a set) to agg: sets have nondeterministic iteration order,
# so the output column order varied between runs (and newer pandas rejects sets).
hr_cnt = hour.groupby(['hr']).cnt.agg(['max','min','mean','sum'])
print (hr_cnt.sort_values(by=['sum'],ascending=False))
hr_cnt['sum'].plot()
ax.set_xlabel('Hour')
ax.set_ylabel('Count')
hr_mnth = hour.groupby(['mnth']).cnt.agg(['max','min','mean','sum'])
print (hr_mnth.sort_values(by=['sum'],ascending=False))
hr_mnth['sum'].plot()
hr_weekday = hour.groupby(['weekday'])['cnt'].sum()
print (hr_weekday.sort_values(ascending=False))
hr_weekday.plot()
#season (1:winter, 2:spring, 3:summer, 4:fall)
hr_season = hour.groupby(['season'])['cnt'].sum()
print (hr_season.sort_values(ascending=False))
hr_season.plot()
# Row with the single busiest hour / day.
# Use .loc: idxmax returns an index LABEL, which only happens to equal a
# position while the index is the default RangeIndex.
hour.loc[hour['cnt'].idxmax()]
day.loc[day['cnt'].idxmax()]
hour[hour['dteday']=='2012-09-15']
# Share of registered vs casual rides, in percent of total count.
round(hour['registered'].sum()/hour['cnt'].sum()*100,2)
round(hour['casual'].sum()/hour['cnt'].sum()*100,2)
# Undo the dataset's min-max normalization to recover physical units.
# temp was normalized via (t-t_min)/(t_max-t_min), t_min=-8, t_max=+39
# (only in hourly scale), so multiply by the 47-degree range and shift by -8.
hour['temp'] = (hour['temp']*47)-8
print (hour['temp'].min())
print(hour['temp'].max())
#atemp: Normalized feeling temperature in Celsius. The values are derived
#via (t-t_min)/(t_max-t_min), t_min=-16, t_max=+50 (only in hourly scale)
hour['atemp'] = (hour['atemp']*66)-16
print (hour['atemp'].min())
print(hour['atemp'].max())
# Humidity was divided by 100 in the source data; restore percent scale.
hour['hum']=hour['hum']*100
# Windspeed was divided by 67 per the dataset docs; the author multiplies by
# 76 here — NOTE(review): verify the correct factor against the dataset README.
hour['windspeed']=hour['windspeed']*76
#Droping unused columns
#Holiday and Working day are the sam we will drop Holiday
#Casual and registered will not be used in this stude
# (casual + registered = cnt, so keeping them would leak the target.)
hour.drop(['holiday','casual','registered'],axis=1,inplace=True)
# --- Correlation between numeric features ---
# Restrict to numeric columns: the string 'dteday' column makes
# DataFrame.corr() raise on pandas >= 2.0 (older pandas silently dropped it).
corr = hour.select_dtypes(include=[np.number]).corr()
print (corr)
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Mask the upper triangle so each pair is shown once.
# np.bool was removed in NumPy 1.24+; the builtin bool is the correct dtype.
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.title("Dataset featurs' Correlation ", fontsize =10)
# Hourly ridership profile, one line per season.
fig,ax = plt.subplots()
fig.set_size_inches(18, 8)
sns.pointplot(data=hour[['hr','cnt','season']],
x='hr',
y='cnt',
hue='season',
ax=ax)
ax.set(title="Season - hourly distribution of counts",xlabel='Hour',ylabel='Total Count')
# Hourly ridership profile, one line per weekday.
fig,ax = plt.subplots()
fig.set_size_inches(18, 8)
sns.pointplot(data=hour[['hr',
'cnt',
'weekday']],
x='hr',
y='cnt',
hue='weekday',
ax=ax)
ax.set(title="Weekday - hourly distribution of counts",xlabel='Hour',ylabel='Total Count')
#Checking for outliners
fig,(ax1,ax2) = plt.subplots(ncols=2)
fig.set_size_inches(18, 8)
sns.boxplot(data=hour['cnt'],ax=ax1)
sns.boxplot(data=hour[['temp','windspeed']],ax=ax2)
hour.columns
# Re-index by date. NOTE(review): this keeps the 'dteday' COLUMN as well,
# which matters later for corr()/zscore over the whole frame.
hour.index=hour['dteday']
# Z-score (Wikipedia definition): the signed number of standard deviations by
# which an observation lies above the mean of what is being measured. Computing
# Z-scores re-centers and re-scales the data (mean 0, std 1), so points far
# from zero are candidates for outliers; a common threshold is |z| > 3. We use
# scipy's zscore function to detect outliers below.
# --- Outlier detection/removal via Z-score, threshold |z| < 3 ---
from scipy import stats
import numpy as np
# stats.zscore only makes sense for numeric data; the string 'dteday' column
# (still present in `hour`) would make it raise, so restrict to numeric dtypes.
numeric = hour.select_dtypes(include=[np.number])
z = np.abs(stats.zscore(numeric))
print(z)
# Keep rows where EVERY numeric feature is within 3 standard deviations.
hour_o = hour[(z < 3).all(axis=1)]
#lets check the shape
print ('Original Dataset',hour.shape,'\n','After removing outliers ',hour_o.shape,'\n','Difference is ',hour.shape[0]-hour_o.shape[0])
# Per-hour boxplots of counts to eyeball outliers within each hour-of-day.
fig,ax = plt.subplots()
fig.set_size_inches(18, 8)
sns.boxplot(data=hour[['cnt',
'hr']],x='hr',y='cnt',ax=ax)
ax.set(title="Checking for outliners in day hours",xlabel='Hour',ylabel='Total Count')
# Linear fit of cnt vs temp per workingday/season facet.
# seaborn >= 0.12 removed positional x/y arguments; pass them as keywords.
sns.lmplot(x='temp', y='cnt', row='workingday', col='season', data=day,
           palette='RdBu_r', fit_reg=True)
# --- Ridership by working day / holiday, per season ---
# Work on COPIES: the original aliased `hour`, and its np.where calls silently
# converted hour['workingday'] to strings as a side effect.
G1 = hour.copy()
# `workingday` is an integer column (0/1). The original compared it to the
# strings '0'/'1', which only "worked" through np.where's string type
# promotion; a single int comparison maps both labels correctly.
G1['workingday'] = np.where(G1['workingday'] == 1, 'Working Day', 'Not Working Day')
g = sns.catplot(x="hr", y="cnt",
hue="workingday", col="season",
data=G1, kind="bar",
height=10, aspect=1)
G2 = hour.copy()
# NOTE(review): 'holiday' is dropped from `hour` earlier in this script, so
# the original line raised KeyError here. Guard so the plot is drawn only
# when the column is still present.
if 'holiday' in G2.columns:
    G2['holiday'] = np.where(G2['holiday'] == 1, 'Holiday', 'Not a Holiday')
    g = sns.catplot(x="hr", y="cnt",
    hue="holiday", col="season",
    data=G2, kind="bar",
    height=10, aspect=1)
#not needed for the project
hour.drop('instant',axis=1,inplace=True)
plt.figure(figsize=(20,5))
# np.bool was removed in NumPy 1.24+; the builtin bool is the correct dtype.
# Also restrict corr() to numeric columns: 'dteday' is a string column and
# makes DataFrame.corr() raise on pandas >= 2.0.
corr_num = hour.select_dtypes(include=[np.number]).corr()
mask = np.zeros_like(corr_num, dtype=bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr_num,cmap='RdBu_r',mask=mask, annot=True)
# Prepare the modeling frame.
# NOTE(review): MOD_READY aliases `hour` until get_dummies reassigns it,
# so `del MOD_READY['dteday']` also removes the column from `hour`.
MOD_READY = hour
MOD_READY.columns
print(MOD_READY.columns)
MOD_READY.index= MOD_READY['dteday']
del MOD_READY['dteday']
# One-hot encode the categorical features (all levels kept).
MOD_READY = pd.get_dummies(MOD_READY, columns=['hr','season','workingday','weathersit'],drop_first=False)
MOD_READY.info()
MOD_READY
MOD_READY_CORR = MOD_READY.corr()
MOD_READY_CORR['cnt']
print(*MOD_READY.columns)
# Normalize the dummy column name in case `workingday` held string labels.
MOD_READY.rename(columns={'workingday_Working Day':'workingday_1'}, inplace=True)
MOD_READY.mean()
### STATSMODELS ###
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
import numpy as np
# create a fitted model: OLS of cnt on all hour/season/workingday/weathersit
# dummies plus the continuous weather covariates.
# NOTE(review): every dummy LEVEL is included alongside the implicit
# intercept (dummy-variable trap), so the design matrix is perfectly
# collinear and individual coefficients are not uniquely identified —
# consider drop_first=True upstream. TODO confirm intent.
ML = smf.ols(formula='cnt ~ hr_0 + hr_1 +hr_2+ hr_3+ hr_4 +hr_5 +hr_6+ hr_7+ hr_8+ hr_9+ hr_10+ hr_11+ hr_12+ hr_13+ hr_14+ hr_15+ hr_16+ hr_17 +hr_18 +hr_19+ hr_20+ hr_21+ hr_22+ hr_23 + season_1 + season_2 + season_3 + season_4 + workingday_0 + workingday_1 + weathersit_1 + weathersit_2 + weathersit_3+ weathersit_4 + temp + atemp + hum + windspeed', data=MOD_READY).fit()
# print the coefficients
ML.params
print(ML.summary())
# Predict one hand-built observation: noon (hr_12=1), spring (season_2=1),
# non-working day, clear weather (weathersit_1=1), given temp/atemp/hum/wind.
ML.predict({'hr_0':[0],'hr_1':[0], 'hr_2':[0],'hr_3':[0], 'hr_4':[0], 'hr_5':[0], 'hr_6':[0], 'hr_7':[0], 'hr_8':[0], 'hr_9':[0], 'hr_10':[0],
'hr_11':[0], 'hr_12':[1], 'hr_13':[0], 'hr_14':[0], 'hr_15':[0], 'hr_16':[0], 'hr_17':[0], 'hr_18':[0],
'hr_19':[0], 'hr_20':[0], 'hr_21':[0], 'hr_22':[0], 'hr_23':[0],'season_1':[0],'season_2':[1],'season_3':[0],'season_4':[0]
,'workingday_0':[1],'workingday_1':[0],'weathersit_1':[1],'weathersit_2':[0],'weathersit_3':[0],'weathersit_4':[0],'temp':[15.35],'atemp':[15.4] , 'hum':[62.7] , 'windspeed':[14.44] })
# create X and y: same dummy + weather feature set as the statsmodels fit.
feature_cols = [ 'temp','hr_0','hr_1', 'hr_2',
'hr_3', 'hr_4', 'hr_5', 'hr_6', 'hr_7', 'hr_8', 'hr_9', 'hr_10',
'hr_11', 'hr_12', 'hr_13', 'hr_14', 'hr_15', 'hr_16', 'hr_17', 'hr_18',
'hr_19', 'hr_20', 'hr_21', 'hr_22', 'hr_23','season_1' ,'season_2', 'season_3',
'season_4', 'workingday_0','workingday_1', 'weathersit_1','weathersit_2', 'weathersit_3','atemp', 'hum' , 'windspeed',
'weathersit_4']
X_ML2 = MOD_READY[feature_cols]
y_ML2 = MOD_READY.cnt
# instantiate and fit on the FULL dataset (no holdout — in-sample fit only).
ML2 = LinearRegression()
ML2.fit(X_ML2, y_ML2)
# print the coefficients
print (ML2.intercept_)
print (ML2.coef_)
### STATSMODELS ###
# you have to create a DataFrame since the Statsmodels formula interface expects it
# (same hand-built observation as above: noon, spring, clear weather).
X_new = pd.DataFrame({ 'temp':[0.26],'hr_0':[0],'hr_1':[0], 'hr_2':[0],'hr_3':[0], 'hr_4':[0], 'hr_5':[0], 'hr_6':[0], 'hr_7':[0], 'hr_8':[0], 'hr_9':[0], 'hr_10':[0],
'hr_11':[0], 'hr_12':[1], 'hr_13':[0], 'hr_14':[0], 'hr_15':[0], 'hr_16':[0], 'hr_17':[0], 'hr_18':[0],
'hr_19':[0], 'hr_20':[0], 'hr_21':[0], 'hr_22':[0], 'hr_23':[0],'season_1':[0] ,'season_2':[1], 'season_3':[0],
'season_4':[0], 'workingday_0':[1],'workingday_1':[0], 'weathersit_1':[1],'weathersit_2':[0], 'weathersit_3':[0],'weathersit_4':[0],'atemp':[15.4] , 'hum':[62.7] , 'windspeed':[14.44]})
# predict for a new observation
ML2.predict(X_new)
# Store in-sample predictions for plotting actual vs predicted below.
x_ML2 = ML2.predict(X_ML2)
MOD_READY['cnt_predect']=x_ML2
MOD_READY.tail()
# Average actual vs predicted counts per (month, noon-indicator) cell.
Model_Plot = MOD_READY.pivot_table(index=['mnth','hr_12'] , margins=False ,values=['cnt', 'cnt_predect'],aggfunc=np.mean)
Model_Plot.plot(figsize=(20,8), title="Prediction Vs Actual", grid=True)
plt.ylabel('Data')
#using train test split: same features minus temp and workingday_0.
feature_cols = ['hr_0','hr_1', 'hr_2',
'hr_3', 'hr_4', 'hr_5', 'hr_6', 'hr_7', 'hr_8', 'hr_9', 'hr_10',
'hr_11', 'hr_12', 'hr_13', 'hr_14', 'hr_15', 'hr_16', 'hr_17', 'hr_18',
'hr_19', 'hr_20', 'hr_21', 'hr_22', 'hr_23','season_1', 'season_2', 'season_3',
'season_4', 'workingday_1', 'weathersit_1','weathersit_2', 'weathersit_3','atemp', 'hum', 'windspeed',
'weathersit_4']
X = MOD_READY[feature_cols]
y = MOD_READY.cnt
# create training and testing vars (80/20 split).
# NOTE(review): no random_state, so results differ between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)
# fit a model and report held-out R^2 and RMSE.
from sklearn.metrics import mean_squared_error
from math import sqrt
lm = linear_model.LinearRegression()
model_FIT = lm.fit(X_train, y_train)
predictions = lm.predict(X_test)
print ('Score:', model_FIT.score(X_test, y_test))
y_pred = lm.predict(X_test)
print('RMSE: %.2f' % sqrt(mean_squared_error(y_test, y_pred)))
# Baseline: raw hr/season treated as plain numeric features (no one-hot
# encoding), to show the value of the feature engineering above.
feature_cols = ['hr','season']
X_N = hour[feature_cols]
y_N = hour.cnt
# create training and testing vars (80/20 split, no fixed random_state).
X_train_N, X_test_N, y_train_N, y_test_N = train_test_split(X_N, y_N, test_size=0.2)
print (X_train_N.shape, y_train_N.shape)
print (X_test_N.shape, y_test_N.shape)
lm_N = linear_model.LinearRegression()
model_N = lm_N.fit(X_train_N, y_train_N)
predictions_N = lm_N.predict(X_test_N)
# Test score, then train score for comparison.
print ('Score:', model_N.score(X_test_N, y_test_N))
print ('Score:', model_N.score(X_train_N, y_train_N))
y_pred_N = model_N.predict(X_test_N)
print('RMSE: %.2f' % sqrt(mean_squared_error(y_test_N, y_pred_N)))
# --- Model with a standardized hour column + one-hot categoricals ---
# BUG FIX: `scaled_hour = hour` aliased the frame, so the StandardScaler
# below silently overwrote hour['hr'] in place. Take a real copy.
scaled_hour = hour.copy()
#df.index = df['dteday']
#Normalizing the hour columns
from sklearn.preprocessing import StandardScaler
cols_to_norm = ['hr']
scaled_hour[cols_to_norm] = StandardScaler().fit_transform(scaled_hour[cols_to_norm])
#Feature Engineering for the categorical columns
scaled_hour = pd.get_dummies(scaled_hour, columns=['season','workingday','weathersit'],drop_first=False)
scaled_hour.head()
scaled_hour.columns
# Normalize the dummy column name in case `workingday` held string labels.
scaled_hour.rename(columns={'workingday_Working Day':'workingday_1'}, inplace=True)
# Model With scaled Hours and categorical features
feature_cols = ['hr','season_1', 'season_2', 'season_3','season_4', 'workingday_1',
'weathersit_1','weathersit_2', 'weathersit_3','temp','atemp','hum','windspeed']
X_scaled_hour = scaled_hour[feature_cols]
y_scaled_hour = scaled_hour.cnt
# create training and testing vars (80/20 split, no fixed random_state).
X_train_Scaled, X_test_Scaled, y_train_Scaled, y_test_Scaled = train_test_split(X_scaled_hour, y_scaled_hour, test_size=0.2)
print (X_train_Scaled.shape, y_train_Scaled.shape)
print (X_test_Scaled.shape, y_test_Scaled.shape)
lm_scaled_hour = linear_model.LinearRegression()
model_scaled_hour = lm_scaled_hour.fit(X_train_Scaled, y_train_Scaled)
predictions_scaled_hour = lm_scaled_hour.predict(X_test_Scaled)
# NOTE(review): this reports the TRAIN score, while the "train test split"
# model above reports a TEST score — the comparison table mixes the two.
print ('Score:', model_scaled_hour.score(X_train_Scaled, y_train_Scaled))
y_pred_Scaled = model_scaled_hour.predict(X_test_Scaled)
print('RMSE: %.2f' % sqrt(mean_squared_error(y_test_Scaled, y_pred_Scaled)))
# Collect scores of all three models for a side-by-side comparison table.
#Hour Normalized model
scaled_Hours_Model_Score = model_scaled_hour.score(X_train_Scaled, y_train_Scaled)
scaled_Hours_Model_RMSE = sqrt(mean_squared_error(y_test_Scaled, y_pred_Scaled))
#train test split Model score
FIT_Model_Score = model_FIT.score(X_test, y_test)
FIT_Hours_Model_RMSE = sqrt(mean_squared_error(y_test, y_pred))
#Model Without feature engineering
Model_NO_FEATURE_Score = model_N.score(X_train_N, y_train_N)
y_pred_N = model_N.predict(X_test_N)
Hours_NO_FEATURE_SModel_RMSE = sqrt(mean_squared_error(y_test_N, y_pred_N))
Model_Scores = pd.DataFrame(
    [['Hour Normalized model', scaled_Hours_Model_Score, scaled_Hours_Model_RMSE],
     ['Train test split Model', FIT_Model_Score, FIT_Hours_Model_RMSE],
     ['Model Without feature engineering', Model_NO_FEATURE_Score, Hours_NO_FEATURE_SModel_RMSE]],
    columns=['Model', 'Score', 'RMSE'])
Model_Scores
#Random Forest
from sklearn.model_selection import train_test_split
feature_cols = ['hr_0','hr_1', 'hr_2',
'hr_3', 'hr_4', 'hr_5', 'hr_6', 'hr_7', 'hr_8', 'hr_9', 'hr_10',
'hr_11', 'hr_12', 'hr_13', 'hr_14', 'hr_15', 'hr_16', 'hr_17', 'hr_18',
'hr_19', 'hr_20', 'hr_21', 'hr_22', 'hr_23','season_1', 'season_2', 'season_3',
'season_4', 'workingday_1', 'weathersit_1','weathersit_2', 'weathersit_3','temp','atemp','hum','windspeed',
'weathersit_4']
X_R = MOD_READY[feature_cols]
y_R = MOD_READY.cnt
# create training and testing vars (fixed seed for reproducibility).
X_train_R, X_test_R, y_train_R, y_test_R = train_test_split(X_R, y_R, test_size=0.2, random_state=0)
# Feature Scaling: fit on the training split only, then reuse the same
# fitted scaler for any data shown to the model.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_R = sc.fit_transform(X_train_R)
X_test_R = sc.transform(X_test_R)
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=20, random_state=0)
regressor.fit(X_train_R, y_train_R)
y_pred_R = regressor.predict(X_test_R)
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_R, y_pred_R))
print('Mean Squared Error:', metrics.mean_squared_error(y_test_R, y_pred_R))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test_R, y_pred_R)))
# BUG FIX: the forest was trained on SCALED features, but the original
# predicted on the raw full design matrix; transform with the same scaler.
x_R_P = regressor.predict(sc.transform(X_R))
MOD_READY['cnt_predect_R']=x_R_P
MOD_READY.columns
def _plot_hour_pivot(aggfunc):
    """Plot actual vs both models' predicted counts, pivoted on
    (month, hour-dummy) for every hr_* column, under `aggfunc`."""
    for col in MOD_READY.filter(regex='hr').columns:
        model_plot = MOD_READY.pivot_table(index=['mnth', col], margins=False,
                                           values=['cnt', 'cnt_predect', 'cnt_predect_R'],
                                           aggfunc=aggfunc)
        model_plot.plot(figsize=(20, 8), title="Prediction Vs Actual", grid=True)
        plt.ylabel('Data')

# The three original copy-pasted loops differed only in the aggregation.
_plot_hour_pivot(np.mean)
_plot_hour_pivot(np.max)
_plot_hour_pivot(np.min)